R/library neale.R

Defines functions get_spss write_spss df_summary dummy_all recode_all auto_cluster factor_analysis tab_summary

Documented in auto_cluster df_summary dummy_all factor_analysis get_spss recode_all tab_summary write_spss

#library(plyr)
#require(dplyr)
#require(tidyr)
#require(stringr)
#require(haven)
#require(survey)
#require(xlsx)
#require(ggplot2)
#options(OutDec= ".")
#options(stringsAsFactors = FALSE)

######################################
##### commands to remember

#macro like functions
#result <- eval(parse(text = paste("data",1)))
#exists('var')
#get('var')
#assign('var',4)

#paste
#dput(names(xtabs(~df$var)))

#clipboard
#read.table("clipboard", sep="\t",header = TRUE)
#write.table(data.frame, "clipboard", sep="\t", row.names=FALSE)
#readClipboard()
#writeClipboard(array)

#xlsx
# wb <- createWorkbook()
# wb <- loadWorkbook(file.xlsx)
# sheet <- createSheet(wb, sheetName='Resumo - Seg1')
# addDataFrame(df.seg, sheet,row.names=FALSE)
# saveWorkbook(wb, file=file.xlsx)

######################################
##### read spss data and labels

get_spss <- function(file){
  #' Load spss data with labels converted to factors
  #'
  #' This function loads an spss .sav file, and creates a dataframe with the description,
  #' type and labels of all variables in the data file. This function automatically applies
  #' as_factor to the data, converting all variables with labels in spss to factors in R.
  #'
  #' To convert labels to factors function \code{haven::as_factor(.,only_labelled = TRUE,ordered=TRUE)} is used.
  #' For details, see \code{?as_factor}.
  #'
  #' The dataframe with the description of all variables contains the following information:
  #' \itemize{
  #'  \item \strong{var}: Variable name in SPSS and R
  #'  \item \strong{name}: Variable label in SPSS
  #'  \item \strong{spss.format}: Variable format in SPSS
  #'  \item \strong{class}: Variable class in R
  #'  \item \strong{n.na}: Number of \code{NA}'s
  #'  \item \strong{n.nan}: Number of \code{NaN}'s
  #'  \item \strong{n.distinct}: Number of distinct values
  #'  \item \strong{n.labels}: Number of labels
  #'  \item \strong{labels}: All labels concatenated and separated by ';'
  #'  \item \strong{labels.1 thru labels.n}: each label is displayed separatly in a variable
  #' }
  #'
  #' @param file A characater string with the location of the spss file to be loaded.
  #' @return A list with two components:
  #' \itemize{
  #'  \item \strong{sav}\emph{(dataframe)}: with the actual data
  #'  \item \strong{vars}\emph{(dataframe)}: with the description of all variables
  #' }
  #'
  #' @examples
  #' spss.data <- get_spss(file)$sav
  #'
  #' spss <- get_spss(file)
  #' spss.data <- spss$sav
  #' spss.vars <- spss$vars

  df.orig <- haven::read_sav(file)

  ########################################
  #fix for NaN instead of NA problem
  # num.vars <- unlist(lapply(df.orig,function(x){is.numeric(x)}))
  # df.orig[,num.vars] <- lapply(df.orig[,num.vars],function(x){ x[is.nan(x)]<-NA;return(x)})
  # df.orig <- as.data.frame(df.orig)
  # char.vars <- unlist(lapply(df.orig,function(x){is.character(x) | is.factor(x)}))
  # df.orig[,char.vars] <- lapply(df.orig[,char.vars],function(x){ x[x == 'NaN']<-NA;return(x)})
  # df.orig <- as.data.frame(df.orig)

  df.orig <- as.data.frame(df.orig %>% purrr::map(function(x){y <- ifelse(is.nan(x)==TRUE,NA,x);attributes(y) <- attributes(x);return(y)}))
  ########################################
  ########################################

  vars <- data.frame(.id=names(df.orig))
  #vars.labels <- plyr::ldply(df.orig,'attr',which="label")
  vars.labels <- as.data.frame(purrr::map_chr(df.orig,~ifelse(length(attr(.,which="label")) == 0,"",attr(.,which="label"))))
  formato <- plyr::ldply(df.orig,'attr',which="format.spss")
  na <- plyr::ldply(df.orig,function(x){sum(is.na(x))})
  nan <- plyr::ldply(df.orig,function(x){sum(is.nan(x))})

  df <- df.orig
  df <- haven::as_factor(df,only_labelled = TRUE)
  n.labels <- plyr::ldply(df,function(x){length(levels(x))})
  labels <- plyr::ldply(df,function(x){paste(levels(x),collapse="; ")})
  distinct <- plyr::ldply(df,function(x){length(xtabs(~x,na.action = na.pass,drop.unused.levels = TRUE))})
  n <- max(as.numeric(n.labels$V1))

  # #convertendo de volta para string
  # if (getOption("stringsAsFactors") == FALSE){
  #   char.vars <- str_detect(formato$attr,"^A")
  #   df[,char.vars] <- sapply(df[,char.vars],as.character)
  # }
  classe <- plyr::ldply(df,function(x){paste(class(x),collapse="-")})

  df.vars <- cbind(vars,vars.labels)
  df.vars <- dplyr::left_join(df.vars,formato,by=".id")
  df.vars <- dplyr::left_join(df.vars,classe,by=".id")
  df.vars <- dplyr::left_join(df.vars,na,by=".id")
  df.vars <- dplyr::left_join(df.vars,nan,by=".id")
  df.vars <- dplyr::left_join(df.vars,distinct,by=".id")
  df.vars <- dplyr::left_join(df.vars,n.labels,by=".id")
  df.vars <- dplyr::left_join(df.vars,labels,by=".id")
  names(df.vars) <- c('var','name','spss.format','class','n.na','n.nan','n.distinct','n.labels','labels')
  df.vars[,paste0('label.',1:n)] <- stringr::str_split_fixed(df.vars$labels,'; ',n)

  return(list(sav=df,vars=df.vars))
}

write_spss <- function(df,file,max=255){
  #' Save spss data
  #'
  #' This function saves an spss '.sav' file removing known erros. String with length 0 are tranformed to " ". Also,
  #' all character string are truncated to max length of paramenter 'max' and [[:cntrl:]] string are removed.
  #'
  #' @param df The dataframe to be exported to sav format.
  #' @param file A characater string with the location of the spss file to be saved.
  #' @param max Maximum size of character strings.
  #' @return NULL
  #'
  #' @examples
  #' spss.data <- write_sav(df,file)

  df <- as.data.frame(purrr::map(df,function(x){
    z <- x
    if(is.character(x) == TRUE){
      z <- stringr::str_replace_all(stringr::str_trunc(z,max),'[[:cntrl:]]','')
      if (max(nchar(as.character(x)))==0){
        z <- rep(" ",length(x))
      }
    }
    attributes(z) <- attributes(x)
    return(z)
  }))

  ret <- haven::write_sav(df,file)
  return(ret)
}

######################################
##### dplyr::summarise dataframe

df_summary <- function(df,drop=FALSE){
  #' Summarises all variables in a dataframe
  #'
  #' This function calculates the following statistics for all variables in the dataframe:
  #' \itemize{
  #'  \item \strong{var}: Variable name
  #'  \item \strong{name}: Variable label
  #'  \item \strong{class}: Variable class
  #'  \item \strong{n.na}: Number of \code{NA}'s
  #'  \item \strong{n.nan}: Number of \code{NaN}'s
  #'  \item \strong{n.distinct}: Number of distinct values
  #'  \item \strong{n.labels}: Number of labels
  #'  \item \strong{labels}: All labels concatenated and separated by ';'
  #'  \item \strong{labels.1 thru labels.n}: each label is displayed separatly in a variable
  #' }
  #'
  #' @param df The dataframe to be summarised.
  #' @param drop If \code{droplevels(df)} should be run before summarizing the dataframe.
  #' This can make the orginal dataframe and the one analyzied differ. We
  #' recomend you run \code{droplevels(df)} before calling this function if needed.
  #' @return A \emph{(dataframe)} with the description of all variables.
  #'
  #' @examples
  #' summary <- df_summary(df)
  #'

  if (drop == TRUE){
    df <- droplevels(df)
    warning('Function droplevels was used. Analised DF may be different than you expect!')
  }

  vars <- plyr::ldply(df,'attr',which="label")
  classe <- plyr::ldply(df,function(x){paste(class(x),collapse="-")})
  na <- plyr::ldply(df,function(x){sum(is.na(x))})
  nan <- plyr::ldply(df,function(x){sum(is.nan(x))})
  n.labels <- plyr::ldply(df,function(x){length(levels(x))})
  labels <- plyr::ldply(df,function(x){paste(levels(x),collapse=";")})
  distinct <- plyr::ldply(df,function(x){length(xtabs(~x,na.action = na.pass))})
  n <- max(as.numeric(n.labels$V1))

  df.vars <- as.data.frame(classe)
  if (dim(vars)[1] > 0){df.vars <- dplyr::left_join(df.vars,vars,by=".id")}
  df.vars <- dplyr::left_join(df.vars,na,by=".id")
  df.vars <- dplyr::left_join(df.vars,nan,by=".id")
  df.vars <- dplyr::left_join(df.vars,distinct,by=".id")
  df.vars <- dplyr::left_join(df.vars,n.labels,by=".id")
  df.vars <- dplyr::left_join(df.vars,labels,by=".id")
  if (dim(vars)[1] > 0){
    names(df.vars) <- c('name','class','var','n.na','n.nan','n.distinct','n.labels','labels')
  } else {
    names(df.vars) <- c('name','class','n.na','n.nan','n.distinct','n.labels','labels')
  }
  if (n > 0){
    df.vars[,paste0('label.',1:n)] <- stringr::str_split_fixed(df.vars$labels,';',n)
  }

  return(NULL)

}

######################################
##### recode functions

dummy_all <- function(df=NULL,reg.exp=NULL,keep_all=TRUE){
  #' Creates an indicador variable for each category of the selected variables
  #'
  #' This function creates an indicador variable for each category of the selected variables
  #' that belong to classes \emph{factor} or \emph{character}. Variables are selected using a
  #' regular expression. The user can choose to keep all categories or to drop the reference
  #' category. The new indicathor variables are named with the category label/value - only
  #' alpha-numerical characters are kept, and all accents are removed. Empty categories (i.e.
  #' factor levels that are defined but not used) are dropped.
  #'
  #' @param df The dataframe containing the variables to be recoded.
  #' @param reg.exp A \emph{regular expression} identifying the variables that should be analysed.
  #' @param keep_all If \code{TRUE}, then for every category a dummy variable will be created. If
  #' \code{FALSE} then the reference category will be dropped.
  #' @return A \emph{(dataframe)} contaning only the dummy variables that were created.
  #'
  #' @examples
  #' df.dummies <- dummy_all(df,reg.exp='^P[0-9]+$',keep_all=TRUE)
  #'


  #dplyr::selecting variables
  df <- df %>% dplyr::select(matches(reg.exp))
  inds <- sapply(df,class) %in% c("factor","character")
  df <- df[,inds]
  if (sum(inds) > 0) warning("Only Factor and Character variables are kept!!!")

  #creating names
  df.names <- df %>% tidyr::gather(var,categ)
  df.names <- df.names %>% dplyr::group_by(var,categ) %>% dplyr::summarise(freq=n())
  df.names <- df.names %>% dplyr::group_by(var) %>% dplyr::mutate(n=row_number())
  df.names$orig <- paste0(df.names$var,df.names$categ)
  df.names$orig <- iconv(df.names$orig,from='UTF-8', to='ASCII//TRANSLIT')
  df.names$name <- str_replace_all(df.names$categ,'[^[:alnum:]]','.')
  df.names$name <- str_replace_all(df.names$name,'\\.+','.')
  df.names$name <- str_replace_all(df.names$name,'\\.$','')
  df.names$name <- iconv(df.names$name,from='UTF-8', to='ASCII//TRANSLIT')
  df.names$new <- paste0(df.names$var,".",df.names$n,"_",df.names$name)
  df.names <- df.names %>% ungroup() %>% dplyr::select(orig,new)

  #creating dataframe of dummies
  if (keep_all==TRUE){
    form <- paste0("~ -1 + ",paste(names(df),collapse = " + "))
    form <- as.formula(form)
    df.dummy <- as.data.frame(model.matrix(form,df,contrasts.arg = lapply(df, contrasts, contrasts=FALSE)))
  } else {
    form <- paste0("~ ",paste(names(df),collapse = " + "))
    form <- as.formula(form)
    df.dummy <- as.data.frame(model.matrix(form,df))
    df.dummy <- df.dummy[,-1]
  }

  #dropping empty variables
  inds <- unname(colSums(df.dummy,na.rm = TRUE) > 0)
  df.dummy <- df.dummy[,inds]
  names.orig <- iconv(names(df.dummy),to='ASCII//TRANSLIT')
  df.merge <- data.frame(orig=names.orig,order=1:length(names.orig))
  df.merge <- dplyr::left_join(df.merge,df.names,by="orig")
  df.merge <- df.merge %>% dplyr::arrange(order)
  if(sum(is.na(df.merge$new)) > 0) stop("Something went wrong with the df labels!")
  names(df.dummy) <- df.merge$new

  return(df.dummy)

}

recode_all <- function(df=NULL){
  #' space holder

}

######################################
##### segmentation - hierarquical + kmeans

#atualizar com o purrr e o broom
#https://cran.r-project.org/web/packages/broom/vignettes/kmeans.html

auto_cluster <- function(df=NULL,grps=3:6,name='kmeans',iter.max=100){
  #' Cluster Analysis combining Hierarquical and Kmeans cluster
  #'
  #' This function runs as Cluster Analysis. The first step is to run a Hierarquical Cluster,
  #' and then use centroids as starting point for the Kmeans Cluster. This function outputs the
  #' cluster ids for each line in the dataframe and a summary of the group sizes. Details of the
  #' analysis for each step are:
  #' \itemize{
  #'  \item \strong{Hierarquical}: Uses Euclidean distance and Ward's method. See more details
  #'   in \code{\link[stats]{hclust}}.
  #'  \item \strong{Kmeans}: Uses default options. See more details in \code{\link[stats]{kmeans}}.
  #' }
  #'
  #' @param df A \emph{dataframe} containing the variables to be used in the analysis.
  #' @param grps An \emph{array} or \emph{number} with the number of groups that should be created.
  #' @param name A \emph{string} with the name of the variables that will be created.
  #' @param iter.max A \emph{number} indicating the maximum number of iterations for the Kmeans cluster.
  #' @return A list with two components:
  #' \itemize{
  #'  \item \strong{grps}\emph{(dataframe)}: with the variables identifying the cluster each observation
  #'  belongs too.
  #'  \item \strong{vars}\emph{(dataframe)}: summary of the number of observations per cluster.
  #' }
  #' @examples
  #' df.cluster <- auto_cluster(df=df,grps=3:6,name='kmeans')
  #'

  df_ <- df

  d <- dist(df, method = "euclidean")
  hrq <- hclust(d, method="ward")

  for (i in grps){
    grp <- cutree(hrq, k=i)
    aux.df <- cbind(grp,df)
    aux.df <- aux.df %>% dplyr::group_by(grp) %>% dplyr::summarise_each(funs(mean))
    aux <- as.matrix(aux.df[,-1])

    #Cluster kmeans
    df_[,paste0(name,i)] <- kmeans(df, aux, iter.max = iter.max)$cluster
  }

  df_ <- df_ %>% dplyr::select(starts_with(name))
  df.seg <- df_ %>% tidyr::gather(var,seg) %>% dplyr::group_by(var,seg) %>% dplyr::summarise(n=n())
  df.seg <- as.data.frame(df.seg %>% tidyr::spread(var,n))

  return(list(grps=df_,summary=df.seg))
}

######################################
##### Factor analysis

factor_analysis <- function(df=NULL,n.fat=NULL,name="fator",sep=".",rotation="varimax",scores="regression",cut=0.2){
  #' Factor Analysis choosing number of factors automatically
  #'
  #' This function runs as Factor Analysis. The first step is to calculate the number of factors
  #' automatically. Once the number of factor is choosen, the Factor Analysis is run, and both
  #' the estimated factor variables and the loadings are calculated. Missing values aren't aloud. Details
  #' of the analysis for each step are:
  #' \itemize{
  #'  \item \strong{# of Factors}: Calculates the number of eigenvalues in the correlation matrix that
  #'  are larger then 1. The idea is that if the variance of a standardized variable is 1, then a factor
  #'  should only be maintained if it's variance is at least one. See more details in \code{\link[base]{eigen}}.
  #'  \item \strong{Factor Analysis}: The default options are \code{rotation='varimax'} and
  #'  \code{scores='regression'}. See more details in \code{\link[stats]{factanal}}.
  #' }
  #'
  #' @param df A \emph{dataframe} containing the variables to be used in the analysis.
  #' @param n.fat A \emph{number} with the number of factors to use. if \emph{NULL} then the
  #' number of factors is calculated automatically.
  #' @param name A \emph{string} with the name of the factor variables that will be created.
  #' @param sep A \emph{string} with the character to be used separating name of the factor
  #' from the number of the factor
  #' @param rotation A \emph{string} identifying the type of rotation to be performed.
  #' @param scores A \emph{string} identifying the type of scores to be extracted.
  #' @param cut A \emph{number} indicating the minimum size of factor loadings that should be kept.
  #' Loadings smaller then \emph{cut} will be dropped.
  #' @return A list with two components:
  #' \itemize{
  #'  \item \strong{factors}\emph{(dataframe)}: with the variables with the scores extrated from
  #'   the factor analysis.
  #'  \item \strong{loadings}\emph{(dataframe)}: with the extracted factor loadings.
  #'  \item \strong{details}\emph{(list)}: with all of the information returned from factanal.
  #' }
  #' @examples
  #' df_factan <- factor_analysis(df=df,cut=0.2)
  #'

  #number of factors
  if (is.null(n.fat)){
    n.fat <- eigen(cor(df,use = "pairwise.complete.obs"))
    n.fat <- sum(n.fat$values >= 1)
  }

  #if there are missings
  #cov.mat <- cov(df[,-1],use = "pairwise.complete.obs")
  #fit <- factanal(x=df[,-1], factors=n.fat, covmat=cov.mat, rotation="varimax",scores="regression")

  fit <- factanal(x=df, factors=n.fat, rotation=rotation,scores=scores)

  df.scores <- as.data.frame(fit$scores)
  names(df.scores) <- paste0(name,sep,1:n.fat)

  load <- as.data.frame(fit$loadings[])
  names(load) <- paste0(name,sep,1:n.fat)
  load$fator <- apply(abs(load),1,which.max)
  load$max <- apply(abs(load[,-dim(load)[2]]),1,max)
  load$var <- row.names(load)
  load <- load %>% tidyr::gather(factor,loading,-var,-fator,-max)
  load$loading <- ifelse(abs(load$loading) <= cut,NA,load$loading)
  load <- load %>% tidyr::spread(factor,loading)
  load <- load %>% dplyr::arrange(fator,-max)
  load <- load[,c('var','fator',paste0(name,sep,1:n.fat))]

  return(list(factors=df.scores,loadings=load,details=fit))
}

######################################
##### dplyr::summarise with totals / margin

tab_summary <- function(df=NULL,reg.exp_lin=NULL,reg.exp_col=NULL,wgt=NULL){
  #' Summary table is created, automatically adding marginal
  #'
  #' This function is still incomplete, for now it only does counts and column total. NEED TO UPDATE!
  #'
  #' @param df A \emph{dataframe} containing the variables to be used in the analysis.
  #' @param reg.exp_lin A \emph{regular expression} identifying the variables that should be used
  #' to create the \emph{lines} of the table.
  #' @param reg.exp_col A \emph{regular expression} identifying the variables that should be used
  #' to create the \emph{columns} of the table.
  #' @param wgt A \emph{string} identifying the weight variable. If \emph{NULL} weights are ignored.
  #' @return A \emph{(dataframe)} with the summary table.
  #' @examples
  #' df.tab <- tab_summary(df=df,reg.exp_lin='^P1[0-4]$',reg.exp_col='^P2[6-9]$',wgt=NULL)
  #'


  if (is.null(wgt)){
    df$peso <- 1
  } else {
    df$peso <- df[,wgt]
  }

  df.tab <- df %>% dplyr::select(peso,matches(reg.exp_lin),matches(reg.exp_col)) %>% tidyr::gather(var,categ,matches(reg.exp_lin)) %>% tidyr::gather(seg,grp,matches(reg.exp_col))
  df.tab <- df.tab %>% dplyr::group_by(seg,grp,var,categ) %>% dplyr::summarise(freq=sum(peso,na.rm = TRUE))
  df.tab <- df.tab %>% dplyr::group_by(seg,grp,var) %>% dplyr::mutate(freq=round(100*freq/sum(freq),1))
  df.tab <- df.tab %>% tidyr::unite(grp,seg,grp)
  df.tab <- df.tab %>% tidyr::spread(grp,freq)

  df.tot.lin <- df %>% dplyr::select(peso,matches(reg.exp_lin)) %>% tidyr::gather(var,categ,matches(reg.exp_lin))
  df.tot.lin <- df.tot.lin %>% dplyr::group_by(var,categ) %>% dplyr::summarise(total=sum(peso,na.rm = TRUE))
  df.tot.lin <- df.tot.lin %>% dplyr::group_by(var) %>% dplyr::mutate(total=round(100*total/sum(total),1))

  df.tab <- dplyr::left_join(df.tot.lin,df.tab,by=c("var","categ"))

  return(df.tab)
}

######################################
##### Raking fix


######################################
##### Soft merge - usando agrep e adist


######################################
##### Funções pra gerar apresentações
##### outro arquivo
neale-eldash/neale documentation built on May 23, 2019, 1:29 p.m.